Lab 8

Author

Ben Kessler

Published

December 4, 2023

Lab 8: Linear Classifiers

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
# Load the strain data set and discard every row that contains a missing value.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs where that file exists.
data = pd.read_csv("/Users/ben/Documents/GitHub/DSML/Data/cannabis_full.csv").dropna()
data.head()
Strain Type Rating Effects Flavor Creative Energetic Tingly Euphoric Relaxed ... Ammonia Minty Tree Fruit Butter Pineapple Tar Rose Plum Pear
0 100-Og hybrid 4.0 Creative,Energetic,Tingly,Euphoric,Relaxed Earthy,Sweet,Citrus 1.0 1.0 1.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 98-White-Widow hybrid 4.7 Relaxed,Aroused,Creative,Happy,Energetic Flowery,Violet,Diesel 1.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1024 sativa 4.4 Uplifted,Happy,Relaxed,Energetic,Creative Spicy/Herbal,Sage,Woody 1.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 13-Dawgs hybrid 4.2 Tingly,Creative,Hungry,Relaxed,Uplifted Apricot,Citrus,Grapefruit 1.0 0.0 1.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 24K-Gold hybrid 4.6 Happy,Relaxed,Euphoric,Uplifted,Talkative Citrus,Earthy,Orange 0.0 0.0 0.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 69 columns

Part One: Binary Classification

# Keep only the sativa and indica rows for the binary problem.
is_data = data[data["Type"].isin(["sativa", "indica"])]
is_data.head()
Strain Type Rating Effects Flavor Creative Energetic Tingly Euphoric Relaxed ... Ammonia Minty Tree Fruit Butter Pineapple Tar Rose Plum Pear
2 1024 sativa 4.4 Uplifted,Happy,Relaxed,Energetic,Creative Spicy/Herbal,Sage,Woody 1.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 3-Bears-Og indica 0.0 None None 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7 303-Og indica 4.2 Relaxed,Happy,Euphoric,Uplifted,Giggly Citrus,Pungent,Earthy 0.0 0.0 0.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 3D-Cbd sativa 4.6 Uplifted,Focused,Happy,Talkative,Relaxed Earthy,Woody,Flowery 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 3X-Crazy indica 4.4 Relaxed,Tingly,Happy,Euphoric,Uplifted Earthy,Grape,Sweet 0.0 0.0 1.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 69 columns

# Predictors: everything except the strain name, the label, and the two comma-separated text columns.
X = is_data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# Target: the strain type encoded as integers by LabelEncoder.
y = LabelEncoder().fit_transform(is_data["Type"])

Q1: LDA

As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.

# Preprocessing shared by every model below: standardize each numeric column.
ct = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number)),
    ],
)

# Preprocessing followed by LDA using the "eigen" solver.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen")),
    ],
)

# Candidate shrinkage values for the grid search.
alphas = {"LDA__shrinkage": [.0001, .001, .01, .1, 1]}

# 5-fold cross-validated grid search scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per shrinkage candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15e82b290>)])),
                ('LDA',
                 LinearDiscriminantAnalysis(shrinkage=1, solver='eigen'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the cross-validated F1 of the LDA grid search.
# NOTE(review): test_scores (set in the grid-search cell) holds one mean CV score per shrinkage
# candidate, so np.mean(test_scores) averages over the whole grid, including the candidates that
# were not selected. gscv_fitted.best_score_ is the score of the chosen model.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")
Average F1 Score across 25 cross validations: 0.7895754166409029
This is a score that is better than random guesses.
# Refit LDA with the shrinkage picked by the grid search and tabulate its predictions.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen", shrinkage=1)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Indica Predicted Sativa
Actual Indica 605 82
Actual Sativa 73 358

Q2: QDA

As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.

# Preprocessing followed by quadratic discriminant analysis.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis()),
    ],
)

# Candidate regularization strengths for the grid search.
alphas = {"QDA__reg_param": [0, .0001, .001, .01, .1, 1]}

# 5-fold cross-validated grid search scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per reg_param candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15d9eec10>)])),
                ('QDA', QuadraticDiscriminantAnalysis(reg_param=1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")
Average F1 Score across 30 cross validations: 0.67734658323323
This is a score that is slightly better than random guesses.
# Refit QDA with the reg_param picked by the grid search and tabulate its predictions.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis(reg_param=1)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Indica Predicted Sativa
Actual Indica 607 80
Actual Sativa 73 358

Q3: SVC

As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.

# Preprocessing followed by a support vector classifier with its default kernel.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC()),
    ],
)

# Candidate values of the regularization parameter C.
alphas = {"SVC__C": [.1, .5, 1, 10, 100]}

# 5-fold cross-validated grid search scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per C candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15e8c1bd0>)])),
                ('SVC', SVC(C=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")
Average F1 Score across 25 cross validations: 0.7222126650132197
This is a score that is better than random guesses.
# Refit the SVC with the C picked by the grid search and tabulate its predictions.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC(C=10)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Indica Predicted Sativa
Actual Indica 676 11
Actual Sativa 26 405

Q4: SVM

As this is a binary classification problem, I believe a good metric to use would be F1-score. This metric considers both precision and recall, making it a good choice when classes are imbalanced.

# Preprocessing followed by a support vector classifier with a polynomial kernel.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVM", SVC(kernel="poly")),
    ],
)

# Grid over the regularization parameter C and the polynomial degree (1 through 10).
alphas = {
    "SVM__C": [.1, 1, 10, 100, 1000],
    "SVM__degree": list(range(1, 11)),
}

# 5-fold cross-validated grid search scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per (C, degree) candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1a32842d0>)])),
                ('SVM', SVC(C=100, degree=1, kernel='poly'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the cross-validated F1 of the polynomial-kernel grid search.
# NOTE(review): np.mean(test_scores) averages the mean CV score of every (C, degree) candidate in the
# grid, so the unselected combinations pull the figure away from the chosen model's score;
# gscv_fitted.best_score_ is the score of the selected model.
# NOTE(review): "about the slightly worse" in the message reads as a wording slip.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is about the slightly worse than random guesses.")
Average F1 Score across 250 cross validations: 0.42109205230492136
This is a score that is about the slightly worse than random guesses.
# Refit the polynomial-kernel SVM with the C and degree picked by the grid search.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVM", SVC(kernel="poly", C=100, degree=1)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Indica Predicted Sativa
Actual Indica 633 54
Actual Sativa 77 354

Part Two: Natural Multiclass

# Multiclass setup: use every strain, with the same predictor columns as in Part One.
X = data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# Target: the strain type encoded as integers by LabelEncoder.
y = LabelEncoder().fit_transform(data["Type"])

Q1: Decision Tree

# Preprocessing followed by a decision tree classifier.
# NOTE(review): no random_state is set on the tree, so repeated runs are not guaranteed to match.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("DTree", DecisionTreeClassifier()),
    ],
)

# Candidate cost-complexity pruning strengths.
alphas = {"DTree__ccp_alpha": [0, .00001, .0001, .001, .01, .1, 1, 10]}

# 5-fold cross-validated grid search scored by macro-averaged F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1_macro")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per ccp_alpha candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x168287090>)])),
                ('DTree', DecisionTreeClassifier(ccp_alpha=0.001))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the cross-validated macro F1 of the decision tree grid search.
# NOTE(review): np.mean(test_scores) averages the mean CV score of every ccp_alpha candidate rather
# than reporting the selected one; gscv_fitted.best_score_ is the score of the chosen model.
# NOTE(review): the message compares against "random guesses", but no random baseline is computed
# in the visible cells. Confirm the comparison before relying on it.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly worse than random guesses.")
Average F1 Score across 40 cross validations: 0.3854219815972776
This is a score that is slightly worse than random guesses.
# Refit the decision tree with the ccp_alpha picked by the grid search and tabulate its predictions.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("DTree", DecisionTreeClassifier(ccp_alpha=.001)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Hybrid Predicted Indica Predicted Sativa
Actual Hybrid 887 178 122
Actual Indica 242 433 12
Actual Sativa 212 18 201
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Draw a pruned decision tree with nodes coloured by class.
# NOTE(review): the tree drawn here is fitted in this cell on the unscaled X; it is not the
# pipeline fitted in the confusion-matrix cell.
plt.figure(figsize=(200, 100))
plot_tree(
    DecisionTreeClassifier(ccp_alpha=.001).fit(X, y),
    filled=True,
    feature_names=X.columns.tolist(),
    class_names=["Hybrid", "Indica", "Sativa"],
)
plt.show()

Examining the decision tree provides a very interesting way of understanding how the model is attempting to classify. To begin, it discovered that the most common difference between sativa and indica is that one is more likely to make a person sleepy than the other. This initial split leaves only 28 sativa strains on the right side of the tree, with the rest pushed to the left side. From here, the logic of the tree differs between the two sides: on the right side, the model uses flavors to classify, while on the left side it focuses on how the strain makes people feel. This approach seems to allow the model to weed out the sativa strains and then focus on the classification between indica and hybrid.

Q2: LDA, QDA, and KNN

# LDA
# Preprocessing followed by LDA using the "eigen" solver.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen")),
    ],
)

# Candidate shrinkage values for the grid search.
alphas = {"LDA__shrinkage": [.0001, .001, .01, .1, 1]}

# 5-fold cross-validated grid search scored by macro-averaged F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1_macro")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per shrinkage candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x16825f310>)])),
                ('LDA',
                 LinearDiscriminantAnalysis(shrinkage=0.1, solver='eigen'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")
Average F1 Score across 25 cross validations: 0.5942931559991004
This is a score that is slightly better than random guesses.
# Refit LDA with the shrinkage picked by the grid search and tabulate its predictions.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LDA", LinearDiscriminantAnalysis(solver="eigen", shrinkage=.1)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Hybrid Predicted Indica Predicted Sativa
Actual Hybrid 835 206 146
Actual Indica 207 468 12
Actual Sativa 222 20 189
# QDA
# Preprocessing followed by quadratic discriminant analysis.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis()),
    ],
)

# Candidate regularization strengths for the grid search.
alphas = {"QDA__reg_param": [0, .0001, .001, .01, .1, 1]}

# 5-fold cross-validated grid search scored by macro-averaged F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1_macro")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per reg_param candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1a2e53d50>)])),
                ('QDA', QuadraticDiscriminantAnalysis(reg_param=1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is worse than random guesses.")
Average F1 Score across 30 cross validations: 0.39560132926837605
This is a score that is worse than random guesses.
# Refit QDA with the reg_param picked by the grid search and tabulate its predictions.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("QDA", QuadraticDiscriminantAnalysis(reg_param=1)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Hybrid Predicted Indica Predicted Sativa
Actual Hybrid 796 213 178
Actual Indica 198 469 20
Actual Sativa 197 18 216
# KNN
# Preprocessing followed by a k-nearest-neighbours classifier.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("KNN", KNeighborsClassifier()),
    ],
)

# Try every neighbourhood size from 1 through 49.
alphas = {"KNN__n_neighbors": list(range(1, 50))}

# 5-fold cross-validated grid search scored by macro-averaged F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1_macro")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per n_neighbors candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1671dbbd0>)])),
                ('KNN', KNeighborsClassifier(n_neighbors=3))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly worse than random guesses.")
Average F1 Score across 245 cross validations: 0.4400240703580052
This is a score that is slightly worse than random guesses.
# Refit KNN with the n_neighbors picked by the grid search and tabulate its predictions.
# NOTE(review): the pipeline is fitted and evaluated on the same rows, so this matrix is in-sample.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("KNN", KNeighborsClassifier(n_neighbors=3)),
    ],
)
fitted_pipeline = my_pipeline.fit(X, y)
y_pred = fitted_pipeline.predict(X)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Hybrid Predicted Indica Predicted Sativa
Actual Hybrid 1003 117 67
Actual Indica 215 464 8
Actual Sativa 180 18 233

Q3

My metrics were significantly worse than in Part One. This is because adding a third, difficult-to-distinguish category lowered the models' accuracy and made them confuse the strain types more often. According to the confusion matrices, the category most likely to get mixed up was hybrid. The models often correctly identified the hybrid strains, but they also over-predicted hybrid, incorrectly categorizing many indica and sativa strains as hybrid. This is because hybrid strains share many characteristics with both sativa and indica.

Part Three: Multiclass from Binary

Q1

# Indica vs. Not Indica
# One-vs-rest target: 1 for indica rows, 0 for every other type.
y = np.where(data["Type"] == "indica", 1, 0)
# SVC
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("SVC", SVC()),
    ],
)

# Candidate values of the regularization parameter C.
alphas = {"SVC__C": [.1, .5, 1, 10, 100]}

# 5-fold cross-validated grid search scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per C candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x16819acd0>)])),
                ('SVC', SVC(C=1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")
Average F1 Score across 25 cross validations: 0.53992997672384
This is a score that is slightly better than random guesses.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVC", SVC(C = 1))
])

fitted_pipeline = my_pipeline.fit(X, y)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=['Actual Other', 'Actual Indica'], columns=['Predicted Other', 'Predicted Indica'])

cm_df
Predicted Other Predicted Indica
Actual Other 1461 157
Actual Indica 216 471
# Logistic Regression
# Preprocessing followed by logistic regression fitted with the saga solver.
my_pipeline = Pipeline(
    steps=[
        ("Preprocessing", ct),
        ("LogReg", LogisticRegression(solver="saga", max_iter=1000)),
    ],
)

# Elastic-net grid: the L1/L2 mixing ratio and the regularization parameter C (1 through 9).
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [.0001, .001, .01, .1, 1],
    "LogReg__C": list(range(1, 10)),
}

# 5-fold cross-validated grid search scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, cv=5, scoring="f1")
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated score per (l1_ratio, C) candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x166edad90>)])),
                ('LogReg',
                 LogisticRegression(C=1, l1_ratio=1, max_iter=1000,
                                    penalty='elasticnet', solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")
Average F1 Score across 225 cross validations: 0.6284981314081471
This is a score that is better than random guesses.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(solver = "saga", max_iter = 1000, penalty = "elasticnet", C = 1, l1_ratio = 1))
])

fitted_pipeline = my_pipeline.fit(X, y)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=['Actual Other', 'Actual Indica'], columns=['Predicted Other', 'Predicted Indica'])

cm_df
Predicted Other Predicted Indica
Actual Other 1435 183
Actual Indica 257 430
# Sativa vs. Not Sativa
y = np.where(data["Type"] == "sativa", 1, 0)
# SVC
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVC", SVC())
])

alphas = {"SVC__C": [.1, .5, 1, 10, 100]}

gscv = GridSearchCV(my_pipeline, alphas, cv = 5, scoring='f1')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x165f26d90>)])),
                ('SVC', SVC(C=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the cross-validated F1 of the sativa-vs-rest SVC grid search.
# NOTE(review): np.mean(test_scores) averages the mean CV score of every C candidate in the grid, so
# weak candidates drag the figure down; gscv_fitted.best_score_ is the score of the selected model.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is significantly worse than random guesses.")
Average F1 Score across 25 cross validations: 0.1920921810596639
This is a score that is significantly worse than random guesses.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVC", SVC(C = 10))
])

fitted_pipeline = my_pipeline.fit(X, y)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=['Actual Other', 'Actual Sativa'], columns=['Predicted Other', 'Predicted Sativa'])

cm_df
Predicted Other Predicted Sativa
Actual Other 1843 31
Actual Sativa 96 335
# Logistic Regression
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(solver = "saga", max_iter = 2500))
])

alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [.0001, .001, .01, .1, 1],
    "LogReg__C": list(range(1, 10))
}

gscv = GridSearchCV(my_pipeline, alphas, cv = 5, scoring='f1')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x167feba90>)])),
                ('LogReg',
                 LogisticRegression(C=1, l1_ratio=0.1, max_iter=2500,
                                    penalty='elasticnet', solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly worse than random guesses.")
Average F1 Score across 225 cross validations: 0.39862439717664777
This is a score that is slightly worse than random guesses.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(solver = "saga", max_iter = 2500, penalty = "elasticnet", C = 1, l1_ratio = .1))
])

fitted_pipeline = my_pipeline.fit(X, y)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=['Actual Other', 'Actual Sativa'], columns=['Predicted Other', 'Predicted Sativa'])

cm_df
Predicted Other Predicted Sativa
Actual Other 1781 93
Actual Sativa 286 145
# Hybrid vs. Not Hybrid
y = np.where(data["Type"] == "hybrid", 1, 0)
# SVC
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVC", SVC())
])

alphas = {"SVC__C": [.1, .5, 1, 10, 100]}

gscv = GridSearchCV(my_pipeline, alphas, cv = 5, scoring='f1')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x167350150>)])),
                ('SVC', SVC(C=0.1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")
Average F1 Score across 25 cross validations: 0.6269445895255663
This is a score that is slightly better than random guesses.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVC", SVC(C = .1))
])

fitted_pipeline = my_pipeline.fit(X, y)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=['Actual Other', 'Actual Hybrid'], columns=['Predicted Other', 'Predicted Hybrid'])

cm_df
Predicted Other Predicted Hybrid
Actual Other 441 677
Actual Hybrid 170 1017
# Logistic Regression
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(solver = "saga", max_iter = 2500))
])

alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [.0001, .001, .01, .1, 1],
    "LogReg__C": list(range(1, 10))
}

gscv = GridSearchCV(my_pipeline, alphas, cv = 5, scoring='f1')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x168a6d750>)])),
                ('LogReg',
                 LogisticRegression(C=1, l1_ratio=1, max_iter=2500,
                                    penalty='elasticnet', solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is slightly better than random guesses.")
Average F1 Score across 225 cross validations: 0.6380025018259865
This is a score that is slightly better than random guesses.
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(solver = "saga", max_iter = 2500, penalty = "elasticnet", C = 1, l1_ratio = 1))
])

fitted_pipeline = my_pipeline.fit(X, y)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=['Actual Other', 'Actual Hybrid'], columns=['Predicted Other', 'Predicted Hybrid'])

cm_df
Predicted Other Predicted Hybrid
Actual Other 651 467
Actual Hybrid 343 844

Q2

The model that did the best job distinguishing the target category from the rest was the SVC model for hybrid vs other. The model that did the worst at distinguishing the target category from the rest was the logistic regression model for hybrid vs other. This makes intuitive sense because hybrid strains are the most difficult to distinguish and that is where the models struggled the most often, even if the best model did a good job of correctly identifying the hybrid strains.

Q3

# One subset per pair of strain types, for the pairwise comparisons.
is_data = data[data["Type"].isin(["sativa", "indica"])]
ih_data = data[data["Type"].isin(["hybrid", "indica"])]
hs_data = data[data["Type"].isin(["hybrid", "sativa"])]
# Indica vs. Sativa
X = is_data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
y = LabelEncoder().fit_transform(is_data["Type"])
# SVC
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("SVC", SVC())
])

alphas = {"SVC__C": [.1, .5, 1, 10, 100]}

gscv = GridSearchCV(my_pipeline, alphas, cv = 5, scoring='f1')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1a2ec04d0>)])),
                ('SVC', SVC(C=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
print(f"Average F1 Score across {5*len(test_scores)} cross validations: {np.mean(test_scores)}\nThis is a score that is better than random guesses.")
Average F1 Score across 25 cross validations: 0.7222126650132197
This is a score that is better than random guesses.
# Refit on the full Indica/Sativa data with the C picked by the grid search.
my_pipeline = Pipeline(steps=[("Preprocessing", ct), ("SVC", SVC(C=10))])
fitted_pipeline = my_pipeline.fit(X, y)

# Predictions are made on the same rows used for fitting, so the matrix below
# is an in-sample (training) confusion matrix.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)

# Rows are actual classes, columns are predicted classes (0 = indica, 1 = sativa).
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Indica Predicted Sativa
Actual Indica 676 11
Actual Sativa 26 405
# Logistic Regression
# Preprocess with `ct`, then fit a logistic regression using the saga solver
# (the solver that supports the elastic-net penalty searched below).
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(max_iter=2500, solver="saga")),
])

# Elastic-net grid: 5 L1/L2 mixing ratios x 9 values of C = 45 candidates.
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [.0001, .001, .01, .1, 1],
    "LogReg__C": [1, 2, 3, 4, 5, 6, 7, 8, 9],
}

# 5-fold cross-validated grid search, scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, scoring="f1", cv=5)
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated F1 score per candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
# Last expression of the cell: displays the selected pipeline.
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x167f63650>)])),
                ('LogReg',
                 LogisticRegression(C=2, l1_ratio=1, max_iter=2500,
                                    penalty='elasticnet', solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the mean of the per-candidate mean F1 scores (5 folds per candidate).
# NOTE(review): this averages over the whole grid, not only the selected
# candidate; gscv_fitted.best_score_ would give the chosen model's CV score.
print(
    f"Average F1 Score across {5 * len(test_scores)} cross validations: "
    f"{np.mean(test_scores)}\n"
    "This is a score that is better than random guesses."
)
Average F1 Score across 225 cross validations: 0.7868439457144402
This is a score that is better than random guesses.
# Refit on the full Indica/Sativa data with the hyperparameters picked by the
# grid search (C=2, l1_ratio=1).
best_logreg = LogisticRegression(
    solver="saga",
    max_iter=2500,
    penalty="elasticnet",
    C=2,
    l1_ratio=1,
)
my_pipeline = Pipeline(steps=[("Preprocessing", ct), ("LogReg", best_logreg)])
fitted_pipeline = my_pipeline.fit(X, y)

# In-sample predictions: the model is evaluated on the rows it was fit on.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)

# Rows are actual classes, columns are predicted classes (0 = indica, 1 = sativa).
cm_df = pd.DataFrame(
    cm,
    index=["Actual Indica", "Actual Sativa"],
    columns=["Predicted Indica", "Predicted Sativa"],
)

cm_df
Predicted Indica Predicted Sativa
Actual Indica 620 67
Actual Sativa 75 356
# Indica vs. Hybrid
# Features: everything except the identifier/text columns and the target.
X = ih_data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# Target: Type as integer codes. LabelEncoder assigns codes in sorted label
# order, so hybrid -> 0 and indica -> 1.
y = LabelEncoder().fit_transform(ih_data["Type"])
# SVC
# Preprocess with `ct` (defined earlier in the file), then fit an SVC.
my_pipeline = Pipeline(steps=[("Preprocessing", ct), ("SVC", SVC())])

# Candidate values for the SVC regularization parameter C.
alphas = {"SVC__C": [.1, .5, 1, 10, 100]}

# 5-fold cross-validated grid search, scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, scoring="f1", cv=5)
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated F1 score per candidate C.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
# Last expression of the cell: displays the selected pipeline.
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x168a6ea50>)])),
                ('SVC', SVC(C=1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the mean of the per-candidate mean F1 scores (5 folds per candidate).
# NOTE(review): this averages over every C value tried, not only the selected
# one; gscv_fitted.best_score_ would give the chosen model's CV score.
print(
    f"Average F1 Score across {5 * len(test_scores)} cross validations: "
    f"{np.mean(test_scores)}\n"
    "This is a score that is slightly better than random guesses."
)
Average F1 Score across 25 cross validations: 0.5707609076506064
This is a score that is slightly better than random guesses.
# Refit on the full Indica/Hybrid data with the C picked by the grid search.
my_pipeline = Pipeline(steps=[("Preprocessing", ct), ("SVC", SVC(C=1))])
fitted_pipeline = my_pipeline.fit(X, y)

# In-sample predictions: the model is evaluated on the rows it was fit on.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)

# Rows are actual classes, columns are predicted classes (0 = hybrid, 1 = indica).
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica"],
    columns=["Predicted Hybrid", "Predicted Indica"],
)

cm_df
Predicted Hybrid Predicted Indica
Actual Hybrid 1029 158
Actual Indica 198 489
# Logistic Regression
# Preprocess with `ct`, then fit a logistic regression using the saga solver
# (the solver that supports the elastic-net penalty searched below).
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(max_iter=2500, solver="saga")),
])

# Elastic-net grid: 5 L1/L2 mixing ratios x 9 values of C = 45 candidates.
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [.0001, .001, .01, .1, 1],
    "LogReg__C": [1, 2, 3, 4, 5, 6, 7, 8, 9],
}

# 5-fold cross-validated grid search, scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, scoring="f1", cv=5)
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated F1 score per candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
# Last expression of the cell: displays the selected pipeline.
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x166f3ccd0>)])),
                ('LogReg',
                 LogisticRegression(C=1, l1_ratio=1, max_iter=2500,
                                    penalty='elasticnet', solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the mean of the per-candidate mean F1 scores (5 folds per candidate).
# NOTE(review): this averages over the whole grid, not only the selected
# candidate; gscv_fitted.best_score_ would give the chosen model's CV score.
print(
    f"Average F1 Score across {5 * len(test_scores)} cross validations: "
    f"{np.mean(test_scores)}\n"
    "This is a score that is slightly better than random guesses."
)
Average F1 Score across 225 cross validations: 0.6434071452695131
This is a score that is slightly better than random guesses.
# Refit on the full Indica/Hybrid data with the hyperparameters picked by the
# grid search (C=1, l1_ratio=1).
best_logreg = LogisticRegression(
    solver="saga",
    max_iter=2500,
    penalty="elasticnet",
    C=1,
    l1_ratio=1,
)
my_pipeline = Pipeline(steps=[("Preprocessing", ct), ("LogReg", best_logreg)])
fitted_pipeline = my_pipeline.fit(X, y)

# In-sample predictions: the model is evaluated on the rows it was fit on.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)

# Rows are actual classes, columns are predicted classes (0 = hybrid, 1 = indica).
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Indica"],
    columns=["Predicted Hybrid", "Predicted Indica"],
)

cm_df
Predicted Hybrid Predicted Indica
Actual Hybrid 998 189
Actual Indica 238 449
# Hybrid vs. Sativa
# Features: everything except the identifier/text columns and the target.
X = hs_data.drop(columns=["Strain", "Type", "Effects", "Flavor"])
# Target: Type as integer codes. LabelEncoder assigns codes in sorted label
# order, so hybrid -> 0 and sativa -> 1.
y = LabelEncoder().fit_transform(hs_data["Type"])
# SVC
# Preprocess with `ct` (defined earlier in the file), then fit an SVC.
my_pipeline = Pipeline(steps=[("Preprocessing", ct), ("SVC", SVC())])

# Candidate values for the SVC regularization parameter C.
alphas = {"SVC__C": [.1, .5, 1, 10, 100]}

# 5-fold cross-validated grid search, scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, scoring="f1", cv=5)
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated F1 score per candidate C.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
# Last expression of the cell: displays the selected pipeline.
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x167dcaf50>)])),
                ('SVC', SVC(C=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the mean of the per-candidate mean F1 scores (5 folds per candidate).
# NOTE(review): this averages over every C value tried, not only the selected
# one; gscv_fitted.best_score_ would give the chosen model's CV score. No
# random-guess baseline is computed here, so the comparison in the message is
# the author's interpretation.
print(
    f"Average F1 Score across {5 * len(test_scores)} cross validations: "
    f"{np.mean(test_scores)}\n"
    "This is a score that is significantly worse than random guesses."
)
Average F1 Score across 25 cross validations: 0.22301352303799224
This is a score that is significantly worse than random guesses.
# Refit on the full Hybrid/Sativa data with the C picked by the grid search.
my_pipeline = Pipeline(steps=[("Preprocessing", ct), ("SVC", SVC(C=10))])
fitted_pipeline = my_pipeline.fit(X, y)

# In-sample predictions: the model is evaluated on the rows it was fit on.
y_pred = fitted_pipeline.predict(X)
cm = confusion_matrix(y, y_pred)

# Rows are actual classes, columns are predicted classes (0 = hybrid, 1 = sativa).
cm_df = pd.DataFrame(
    cm,
    index=["Actual Hybrid", "Actual Sativa"],
    columns=["Predicted Hybrid", "Predicted Sativa"],
)

cm_df
Predicted Hybrid Predicted Sativa
Actual Hybrid 1156 31
Actual Sativa 93 338
# Logistic Regression
# Preprocess with `ct`, then fit a logistic regression using the saga solver
# (the solver that supports the elastic-net penalty searched below).
my_pipeline = Pipeline(steps=[
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(max_iter=2500, solver="saga")),
])

# Elastic-net grid: 5 L1/L2 mixing ratios x 9 values of C = 45 candidates.
alphas = {
    "LogReg__penalty": ["elasticnet"],
    "LogReg__l1_ratio": [.0001, .001, .01, .1, 1],
    "LogReg__C": [1, 2, 3, 4, 5, 6, 7, 8, 9],
}

# 5-fold cross-validated grid search, scored by F1.
gscv = GridSearchCV(estimator=my_pipeline, param_grid=alphas, scoring="f1", cv=5)
gscv_fitted = gscv.fit(X, y)
# One mean cross-validated F1 score per candidate.
test_scores = gscv_fitted.cv_results_["mean_test_score"]
# Last expression of the cell: displays the selected pipeline.
gscv_fitted.best_estimator_
Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('standardize',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x167f69c10>)])),
                ('LogReg',
                 LogisticRegression(C=9, l1_ratio=0.0001, max_iter=2500,
                                    penalty='elasticnet', solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Report the mean of the per-candidate mean F1 scores (5 folds per candidate).
# NOTE(review): this averages over the whole grid, not only the selected
# candidate; gscv_fitted.best_score_ would give the chosen model's CV score. No
# random-guess baseline is computed here, so the comparison in the message is
# the author's interpretation.
print(
    f"Average F1 Score across {5 * len(test_scores)} cross validations: "
    f"{np.mean(test_scores)}\n"
    "This is a score that is slightly worse than random guesses."
)
Average F1 Score across 225 cross validations: 0.40561046292482145
This is a score that is slightly worse than random guesses.
# Refit on the full Hybrid/Sativa data with the hyperparameters picked by the
# grid search (C=9, l1_ratio=.0001).
my_pipeline = Pipeline([
    ("Preprocessing", ct),
    ("LogReg", LogisticRegression(solver = "saga", max_iter = 2500, penalty = "elasticnet", C = 9, l1_ratio = .0001))
])

fitted_pipeline = my_pipeline.fit(X, y)

# In-sample predictions: the model is evaluated on the rows it was fit on.
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

# X and y here come from hs_data (hybrid vs. sativa), and LabelEncoder codes
# the classes in sorted order (hybrid -> 0, sativa -> 1), so the second
# row/column is Sativa. The labels previously said "Indica", which was a
# copy-paste slip from the Indica vs. Hybrid section.
cm_df = pd.DataFrame(cm, index=['Actual Hybrid', 'Actual Sativa'], columns=['Predicted Hybrid', 'Predicted Sativa'])

cm_df
Predicted Hybrid Predicted Indica
Actual Hybrid 1089 98
Actual Indica 273 158

Q4

The model that did the best at distinguishing between the two groups was the SVC model distinguishing between Indica and Sativa. The model that was the worst at distinguishing between the two groups was the logistic regression model distinguishing between Indica and Hybrid. This does make intuitive sense as Indica and Sativa should be the easiest for the model to distinguish between, due to them having different traits on average, while Hybrid and Indica can share many traits and Hybrid is the most difficult to accurately categorize.

Q5

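If the full data with all three classes had been passed to LogisticRegression, sklearn would not have used OvR by default: with the default setting (multi_class="auto") and any solver other than liblinear, it fits a single multinomial (softmax) model, and OvR is only chosen for binary targets or the liblinear solver. For SVC, sklearn would use OvO by default.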